capture log close
clear all
set more off

* Stablish Working Directory ***************************************************
cd "$workdirectory"


*-------------------------------------------------------------------------------
* Canada - For Canada we count industry shares by metro area (METROCA) and then match it directly to the FUA dataset

* Importing Files

import delimited raw_datasets\Maps\individual_countries\canada\canada_1981.csv, varnames(10) rowrange(11) clear
gen country_year = "canada_1981"
save "processed_datasets\maps_and_cities_canada.dta", replace

import delimited raw_datasets\Maps\individual_countries\canada\canada_1991.csv, varnames(10) rowrange(11) clear
gen country_year = "canada_1991"
append using "processed_datasets\maps_and_cities_canada.dta"
save "processed_datasets\maps_and_cities_canada.dta", replace

import delimited raw_datasets\Maps\individual_countries\canada\canada_2001.csv, varnames(10) rowrange(11) clear
gen country_year = "canada_2001"
append using "processed_datasets\maps_and_cities_canada.dta"
save "processed_datasets\maps_and_cities_canada.dta", replace

* Cleaning
keep if v1 == "Weighted N"
drop v1
drop if v2 == "0 Not in a metropolitan area"
drop if v2 == "COL TOTAL"

* Genearting efua_id variable
* Notes: Windsor is identified as "Detroit" in the FUA dataset
* Sudbury and Thunder Bay; Regina and Saskatoon; Sherbrooke and Trois-RiviÃ¨res; are dropped as they are merged in the indgen dataset but are not a single FUA
* Oshawa and Hamilton belong to Toronto in the FUA dataset

gen efua_id = 0
replace efua_id = 3865 if v2 == "1 Halifax"
replace efua_id = 2360 if v2 == "10 Kitchener"
replace efua_id = 2220 if v2 == "11 London"
replace efua_id = 1996 if v2 == "12 Windsor"
replace efua_id = 1837 if v2 == "14 Winnipeg"
replace efua_id = 1276 if v2 == "16 Calgary"
replace efua_id = 1572 if v2 == "17 Edmonton"
replace efua_id = 375 if v2 == "18 Vancouver"
replace efua_id = 28 if v2 == "19 Victoria"
replace efua_id = 3789 if v2 == "2 QuÃ©bec"
replace efua_id = 3485 if v2 == "3 MontrÃ©al"
replace efua_id = 3343 if v2 == "5 Ottawa-Hull"
replace efua_id = 2626 if v2 == "6 Oshawa"
replace efua_id = 2626 if v2 == "7 Toronto"
replace efua_id = 2626 if v2 == "8 Hamilton"
replace efua_id = 2751 if v2 == "9 St. Catharines-Niagara"

drop if v2 == "13 Sudbury and Thunder Bay"
drop if v2 == "15 Regina and Saskatoon"
drop if v2 == "4 Sherbrooke and Trois-RiviÃ¨res"

drop v2

* We now collapse to solve the issue of multiple cities belonging to Toronto in the FUA dataset
collapse (sum) niunotinuniverse agriculturefishingandforestry miningandextraction manufacturing construction wholesaleandretailtrade hotelsandrestaurants transportationstorageandcommunic financialservicesandinsurance publicadministrationanddefense businessservicesandrealestate education healthandsocialwork otherservices unknown rowtotal electricitygaswaterandwastemanag, by(efua_id country_year)

save "processed_datasets\maps_and_cities_canada.dta", replace

* Adding FUA population

import delimited raw_datasets\Maps\intersection.csv, clear
keep if cntry_na_1 == "Canada"
collapse (mean) fua_p_2015 (first) efua_name, by(efua_id)

tempfile can
save `can'

use "processed_datasets\maps_and_cities_canada.dta", clear
merge m:1 efua_id using `can'
keep if _merge == 3
drop _merge

* Renaming industries

rename niunotinuniverse 						niu
rename agriculturefishingandforestry 			agri
rename miningandextraction 						mining
rename manufacturing 							mfg
rename electricitygaswaterandwastemanag 		utilities
rename construction 							construction
rename wholesaleandretailtrade 					trade
rename hotelsandrestaurants 					hospitality
rename transportationstorageandcommunic 		transport
rename financialservicesandinsurance 			fin_insu
rename publicadministrationanddefense 			govmt
rename businessservicesandrealestate 			bussserv_rs
rename education 								educ
rename healthandsocialwork 						health
rename otherservices 							other_serv
// rename privatehouseholdservices 				house_serv
rename unknown 									unknown
rename rowtotal 								rowtotal	
// rename otherindustrynec 						other_industry
// rename servicesnotspecified 					serv_notsp
// rename responsesuppressed						resp_supressed

* Formating to fit main dataset
foreach v of varlist niu agri mining mfg utilities construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv unknown rowtotal{
	gen _`v' = `v'
}

drop niu agri mining mfg construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv unknown rowtotal utilities

* Replacing with 0 when missing
foreach v3 of varlist _niu _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _unknown _rowtotal{
	replace `v3' = 0 if `v3' == .
}

* Calculating shares
foreach v2 of varlist _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _unknown{
	gen share`v2' = `v2'/(_rowtotal-_niu)
}

* We create 6 variables for industries that don't exists in these countries
* census, so that they don't appear with *missing* when appending
gen _other_industry 		= 0
gen _serv_notsp   			= 0
gen _resp_supressed 		= 0
gen _house_serv				= 0
gen share_other_industry 	= 0
gen share_serv_notsp		= 0
gen share_resp_supressed	= 0
gen share_house_serv		= 0

save "processed_datasets\maps_and_cities_canada.dta", replace


*-------------------------------------------------------------------------------
* Philippines (2000) - For Philippines we count industry shares by city (CITYPH) and then match it directly to the FUA dataset

import delimited raw_datasets\Maps\individual_countries\philippines\philippines_2000.csv, varnames(10) rowrange(11) clear
gen country_year = "philippines_2000"
save "processed_datasets\maps_and_cities_philippines.dta", replace

* Cleaning
keep if v1 == "Weighted N"
drop v1
drop if v2 == "99 Not in an identified city"
drop if v2 == "COL TOTAL"

* Genearting efua_id variable
* Notes: 
* Mandaue belongs to Cebu's FUA
* Mandaluyong; Marikina; Pasig; Valenzuela; Kalookan; Las PiÃ±as; Makati; Muntinlupa ; ParaÃ±aque; Pasay belong to Manila's FUA

gen efua_id = 0
replace efua_id = 4852 if v2 == "1 Butuan"
replace efua_id = 999 if v2 == "2 Baguio"
replace efua_id = 4372 if v2 == "3 Cebu"
replace efua_id = 4372 if v2 == "4 Mandaue"
replace efua_id = 4985 if v2 == "5 Davao"
replace efua_id = 3888 if v2 == "6 Iloilo"
replace efua_id = 4634 if v2 == "7 Iligan"
replace efua_id = 4750 if v2 == "8 Cagayan De Oro"
replace efua_id = 4101 if v2 == "9 Bacolod"
replace efua_id = 1797 if v2 == "10 Angeles"
replace efua_id = 3687 if v2 == "11 Lucena"
replace efua_id = 4910 if v2 == "12 General Santos"
replace efua_id = 1528 if v2 == "13 Olongapo"
replace efua_id = 4067 if v2 == "14 Zamboanga"
replace efua_id = 2845 if v2 == "15 Mandaluyong"
replace efua_id = 2845 if v2 == "16 Marikina"
replace efua_id = 2845 if v2 == "17 Pasig"
replace efua_id = 2845 if v2 == "18 Quezon"
replace efua_id = 2845 if v2 == "19 Valenzuela"
replace efua_id = 2845 if v2 == "20 Kalookan"
replace efua_id = 2845 if v2 == "21 Las PiÃ±as"
replace efua_id = 2845 if v2 == "22 Makati"
replace efua_id = 2845 if v2 == "23 Muntinlupa"
replace efua_id = 2845 if v2 == "24 ParaÃ±aque"
replace efua_id = 2845 if v2 == "25 Pasay"
replace efua_id = 4722 if v2 == "26 Cotabato"
replace efua_id = 4693 if v2 == "27 Marawi"
replace efua_id = 2845 if v2 == "28 Manila"

drop v2

* We now collapse to solve the issue of multiple cities belonging to a single FUA
collapse (sum) niunotinuniverse agriculturefishingandforestry miningandextraction manufacturing electricitygaswaterandwastemanag construction wholesaleandretailtrade hotelsandrestaurants transportationstorageandcommunic financialservicesandinsurance publicadministrationanddefense businessservicesandrealestate education healthandsocialwork otherservices privatehouseholdservices unknown rowtotal, by(efua_id country_year)

save "processed_datasets\maps_and_cities_philippines.dta", replace

* Adding FUA population
import delimited raw_datasets\Maps\intersection.csv, clear
keep if cntry_na_1 == "Philippines"
collapse (mean) fua_p_2015 (first) efua_name, by(efua_id)

tempfile plp
save `plp'

use "processed_datasets\maps_and_cities_philippines.dta", clear
merge m:1 efua_id using `plp'
keep if _merge == 3
drop _merge

* Renaming industries

rename niunotinuniverse 						niu
rename agriculturefishingandforestry 			agri
rename miningandextraction 						mining
rename manufacturing 							mfg
rename electricitygaswaterandwastemanag 		utilities
rename construction 							construction
rename wholesaleandretailtrade 					trade
rename hotelsandrestaurants 					hospitality
rename transportationstorageandcommunic 		transport
rename financialservicesandinsurance 			fin_insu
rename publicadministrationanddefense 			govmt
rename businessservicesandrealestate 			bussserv_rs
rename education 								educ
rename healthandsocialwork 						health
rename otherservices 							other_serv
rename privatehouseholdservices 				house_serv
rename unknown 									unknown
rename rowtotal 								rowtotal	
// rename otherindustrynec 						other_industry
// rename servicesnotspecified 					serv_notsp
// rename responsesuppressed						resp_supressed

* Formating to fit main dataset
foreach v of varlist niu agri mining mfg utilities construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv house_serv unknown rowtotal{
	gen _`v' = `v'
}

drop niu agri mining mfg utilities construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv house_serv unknown rowtotal

* Replacing with 0 when missing
foreach v3 of varlist _niu _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _house_serv _unknown _rowtotal{
	replace `v3' = 0 if `v3' == .
}

* Calculating shares
foreach v2 of varlist _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _house_serv _unknown{
	gen share`v2' = `v2'/(_rowtotal-_niu)
}

* We create 6 variables for industries that don't exists in these countries
* census, so that they don't appear with *missing* when appending
gen _other_industry 		= 0
gen _serv_notsp   			= 0
gen _resp_supressed 		= 0
gen share_other_industry 	= 0
gen share_serv_notsp		= 0
gen share_resp_supressed	= 0

save "processed_datasets\maps_and_cities_philippines.dta", replace


*-------------------------------------------------------------------------------
* United States

* Importing intersection of FUAs and districts

import delimited raw_datasets\Maps\intersection_usa.csv, encoding(UTF-8) clear
save "processed_datasets\maps_and_cities_usa.dta", replace

* Importing area of districts
* We drop observations without geolevel2

import delimited raw_datasets\Maps\area_subdist_usa.csv, encoding(UTF-8) clear
keep geolevel2 area_subd
drop if geolevel2 == .
tempfile area_usa
save `area_usa', replace

* Merging and obtaining shares
use "processed_datasets\maps_and_cities_usa.dta", clear
merge m:1 geolevel2 using `area_usa'
keep if _merge == 3
drop _merge

gen share = area_calcu/area_subd
sort geolevel2
save "processed_datasets\maps_and_cities_usa.dta", replace

********************************************************************************
* Generating industry per city data

* Generating empty file to attach data to
clear all
tempfile data_indgen_cities_usa
save `data_indgen_cities_usa', empty replace

* Genearting local with all .csv in the folder
local files : dir "raw_datasets\Maps\individual_countries\usa" files "*.csv"

cd raw_datasets\Maps\individual_countries\usa

* Importing, appending and saving
foreach fi in `files' {
  import delimited `fi', varnames(11) rowrange(12) clear
  gen country_year = subinstr("`fi'",".csv","",.)
  append using `data_indgen_cities_usa'
  save `data_indgen_cities_usa', replace
}

* Cleaning
keep if v1 == "Weighted N"
drop v1
rename v2 census
drop if census == "COL TOTAL"

* Exctracting geolevel1
gen geolevel2 = regexs(0) if(regexm(census, "[0-9]+"))
destring geolevel2, replace

sort geolevel2
save `data_indgen_cities_usa', replace


*-------------------------------------------------------------------------------

cd ..\..\..\..

* Joining both datasets
use "processed_datasets\maps_and_cities_usa.dta", clear
joinby geolevel2 using `data_indgen_cities_usa'

* Renaming industries

rename niunotinuniverse 						niu
rename agriculturefishingandforestry 			agri
rename miningandextraction 						mining
rename manufacturing 							mfg
rename electricitygaswaterandwastemanag 		utilities
rename construction 							construction
rename wholesaleandretailtrade 					trade
rename hotelsandrestaurants 					hospitality
rename transportationstorageandcommunic 		transport
rename financialservicesandinsurance 			fin_insu
rename publicadministrationanddefense 			govmt
rename businessservicesandrealestate 			bussserv_rs
rename education 								educ
rename healthandsocialwork 						health
rename otherservices 							other_serv
rename privatehouseholdservices 				house_serv
// rename unknown 									unknown
rename rowtotal 								rowtotal	
// rename otherindustrynec 						other_industry
// rename servicesnotspecified 					serv_notsp
// rename responsesuppressed						resp_supressed

* Applying Shares
foreach v of varlist niu agri mining mfg utilities construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv house_serv rowtotal {
	gen _`v' = `v'*share
}

* Fixing country names from both datasets
replace cntry_na_1 = "United States" if cntry_na_1 == "UnitedStates"

* Drop intersections which associate districts from country i to a city located in country j
drop if cntry_name != cntry_na_1

* Collapsing by city and census
collapse (sum) _* (mean) fua_p_2015 (first) efua_name ,by(efua_id country_year)

foreach v2 of varlist _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _house_serv {
	gen share`v2' = `v2'/(_rowtotal-_niu)
}

* We create 6 variables for industries that don't exists in these countries
* census, so that they don't appear with *missing* when appending
gen _other_industry 		= 0
gen _serv_notsp   			= 0
gen _resp_supressed 		= 0
gen _unknown				= 0
gen share_other_industry 	= 0
gen share_serv_notsp		= 0
gen share_resp_supressed	= 0
gen share_unknown			= 0

save "processed_datasets\maps_and_cities_usa.dta", replace


*-------------------------------------------------------------------------------
* Italy - For Italy we count industry shares by city and then match it directly to the FUA dataset

use raw_datasets\Maps\individual_countries\italy\ipumsi_00014, clear

* We add the code for each different year into a single variable
egen city = rowtotal(it2014h_gracom it2015h_gracom it2016h_gracom it2017h_gracom it2018h_gracom it2019h_gracom)

* Keeping only the variables we need
keep year perwt indgen city

* Dropping non-cities
drop if city == 99999

* Genearting efua_id variable
gen efua_id = 0
replace efua_id = 85 	if city == 1272
replace efua_id = 1309 	if city == 10025
replace efua_id = 1600 	if city == 15146
replace efua_id = 2772 	if city == 23091
replace efua_id = 3592 	if city == 27042
replace efua_id = 3157 	if city == 37006
replace efua_id = 3106 	if city == 48017
replace efua_id = 3881	if city == 58091
replace efua_id = 4333 	if city == 63049
replace efua_id = 5004	if city == 72006
replace efua_id = 4232 	if city == 82053
replace efua_id = 4771 	if city == 83048
replace efua_id = 4658 	if city == 87015
drop city

* Genearting country_year variable
gen country_year = ""
foreach y of numlist 2014/2019{
    replace country_year = "italy_`y'" if year == `y'
}
drop year

* Summing perwt at year, indgen and efua_id level
collapse (sum) perwt, by(country_year indgen efua_id)

* Reshaping to fit main dataset format
sort country_year efua_id
reshape wide perwt, i(country_year efua_id) j(indgen)

* Renaming to match with main dataset
rename perwt0		niunotinuniverse	
rename perwt10		agriculturefishingandforestry 
rename perwt20		miningandextraction
rename perwt30		manufacturing 
rename perwt40		electricitygaswaterandwastemanag 
rename perwt50		construction 
rename perwt60		wholesaleandretailtrade
rename perwt70		hotelsandrestaurants
rename perwt80		transportationstorageandcommunic
rename perwt90		financialservicesandinsurance 
rename perwt100		publicadministrationanddefense
rename perwt111		businessservicesandrealestate
rename perwt112		education 
rename perwt113		healthandsocialwork 
rename perwt114		otherservices 
rename perwt120		privatehouseholdservices
rename perwt130		otherindustrynec	
// rename perwt999		unknown


* Generating Row Total variables
egen rowtotal = rowtotal(niunotinuniverse agriculturefishingandforestry miningandextraction manufacturing electricitygaswaterandwastemanag construction wholesaleandretailtrade hotelsandrestaurants transportationstorageandcommunic financialservicesandinsurance publicadministrationanddefense businessservicesandrealestate education healthandsocialwork otherservices privatehouseholdservices otherindustrynec)

* Renaming again
rename niunotinuniverse 						niu
rename agriculturefishingandforestry 			agri
rename miningandextraction 						mining
rename manufacturing 							mfg
rename electricitygaswaterandwastemanag 		utilities
rename construction 							construction
rename wholesaleandretailtrade 					trade
rename hotelsandrestaurants 					hospitality
rename transportationstorageandcommunic 		transport
rename financialservicesandinsurance 			fin_insu
rename publicadministrationanddefense 			govmt
rename businessservicesandrealestate 			bussserv_rs
rename education 								educ
rename healthandsocialwork 						health
rename otherservices 							other_serv
rename privatehouseholdservices 				house_serv
*rename unknown 									unknown
rename rowtotal 								rowtotal	
rename otherindustrynec 						other_industry
*rename servicesnotspecified 					serv_notsp
*rename responsesuppressed						resp_supressed

* Formating to fit main dataset
foreach v of varlist niu agri mining mfg utilities construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv house_serv other_industry rowtotal{
	gen _`v' = `v'
}

drop niu agri mining mfg utilities construction trade hospitality transport fin_insu govmt bussserv_rs educ health other_serv house_serv other_industry rowtotal

* Replacing with 0 when missing
foreach v3 of varlist _niu _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _house_serv _other_industry _rowtotal{
	replace `v3' = 0 if `v3' == .
}

* Calculating shares
foreach v2 of varlist _agri _mining _mfg _utilities _construction _trade _hospitality _transport _fin_insu _govmt _bussserv_rs _educ _health _other_serv _house_serv _other_industry{
	gen share`v2' = `v2'/(_rowtotal-_niu)
}

save "processed_datasets\maps_and_cities_italy.dta", replace


* Adding FUA population
import delimited raw_datasets\Maps\intersection.csv, clear
keep if cntry_na_1 == "Italy"
collapse (mean) fua_p_2015 (first) efua_name, by(efua_id)

tempfile ita
save `ita'

use "processed_datasets\maps_and_cities_italy.dta", clear
merge m:1 efua_id using `ita'
keep if _merge == 3
drop _merge

* We create 6 variables for industries that don't exists in these countries
* census, so that they don't appear with *missing* when appending
gen _serv_notsp   			= 0
gen _resp_supressed 		= 0
gen _unknown				= 0
gen share_serv_notsp		= 0
gen share_resp_supressed	= 0
gen share_unknown			= 0

save "processed_datasets\maps_and_cities_italy.dta", replace
